{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "bce5cb53",
   "metadata": {},
   "source": [
    "### Example for running PII detection and anonymization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b66801c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "from pii_detection import scan_pii_batch\n",
    "from pii_redaction import redact_pii_batch, random_replacements\n",
    "\n",
    "ds = load_dataset(\"bigcode/pii-for-code\", split=\"train\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f76c9e5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_pii = ds.map(scan_pii_batch, batched=True, batch_size=100, num_proc=12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "06d15f83",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset after PII detection:\n",
      "Dataset({\n",
      "    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified', 'id', 'secrets', 'has_secrets', 'number_secrets'],\n",
      "    num_rows: 400\n",
      "})\n",
      "Number of samples that contained PII: 211\n",
      "Total number of secrets found: 336\n"
     ]
    }
   ],
   "source": [
    "print(f\"Dataset after PII detection:\\n{ds_pii}\")\n",
    "print(f\"Number of samples that contained PII: {sum(ds_pii['has_secrets'])}\")\n",
    "print(f\"Total number of secrets found: {sum(ds_pii['number_secrets'])}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b54c5044",
   "metadata": {},
   "source": [
    "#### About the detection and anonymization:\n",
    "* we detect secret keys with detect-secrets and mask them with keys from these 4 randomly generated sequences -they can change in each execution on a new dataset-: \n",
    "        ```\n",
    "        ['q8jtgev49gw1un9427qd9afza5vpuemo',\n",
    "        'pj82ffu65gt9sh9v8n9s2fyupslmlcq4',\n",
    "        'efijcf8z7r7pn0r25wfuh5vmpbrhoxkv',\n",
    "        '1dgjoc8ebhmhzfxhcbmlh4ndb81gqeoe']\n",
    "        ```\n",
    "        \n",
    "* we detect email addresses and mask them with one of these 4 emails (first part was randomly generated) -they can change in each execution on a new dataset-:\n",
    "        ```\n",
    "        ['mynbi@email.com',\n",
    "        'qpmzj@email.com',\n",
    "        'plsgq@email.com',\n",
    "        'ejeyd@email.com']\n",
    "        ```\n",
    "\n",
    "* we detect IP addresses (and DNS servers) and mask them with the random private addresses below (they are fixed). Note that private IP addresses aren't masked (we use `ipaddress` python library to determine if they are private or not):\n",
    "```\n",
    "{'IPv4': ['172.16.31.10',\n",
    "        '172.16.58.3',\n",
    "        '192.168.127.12',\n",
    "        '192.168.3.11'],\n",
    "'IPv6': ['fd00:c2b6:b24b:be67:2827:688d:e6a1:6a3b',\n",
    "                'fc00:e968:6179::de52:7100',\n",
    "                'fc00:db20:35b:7399::5',\n",
    "                'fdf8:f53e:61e4::18']},\n",
    "```\n",
    "\n",
    "Remarks:\n",
    "* If the same secret appears multiple times in a file, we use the same replacement each time.\n",
    "* To solve issue with dns servers being versions, we only detect an address in format x.x.x.x where x is one digit, if the words \"dns\" or \"sever\" appear in the near context."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "68669831",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'EMAIL': ['mynbi@email.com',\n",
      "           'qpmzj@email.com',\n",
      "           'plsgq@email.com',\n",
      "           'ejeyd@email.com'],\n",
      " 'IP_ADDRESS': {'IPv4': ['172.16.31.10',\n",
      "                         '172.16.58.3',\n",
      "                         '192.168.127.12',\n",
      "                         '192.168.3.11'],\n",
      "                'IPv6': ['fd00:c2b6:b24b:be67:2827:688d:e6a1:6a3b',\n",
      "                         'fc00:e968:6179::de52:7100',\n",
      "                         'fc00:db20:35b:7399::5',\n",
      "                         'fdf8:f53e:61e4::18']},\n",
      " 'KEY': ['q8jtgev49gw1un9427qd9afza5vpuemo',\n",
      "         'pj82ffu65gt9sh9v8n9s2fyupslmlcq4',\n",
      "         'efijcf8z7r7pn0r25wfuh5vmpbrhoxkv',\n",
      "         '1dgjoc8ebhmhzfxhcbmlh4ndb81gqeoe']}\n"
     ]
    }
   ],
   "source": [
    "# redaction\n",
    "import random\n",
    "from pprint import pprint\n",
    "random.seed(0)\n",
    "\n",
    "replacements = random_replacements()\n",
    "pprint(replacements)\n",
    "ds_redacted = ds_pii.map(lambda x: redact_pii_batch(x, replacements), batched=True, batch_size=100, num_proc=12, load_from_cache_file=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "e060ed7e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['content', 'language', 'license', 'path', 'annotation_id', 'pii', 'pii_modified', 'id', 'secrets', 'has_secrets', 'number_secrets', 'new_content', 'redaction_refs'],\n",
       "    num_rows: 400\n",
       "})"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds_redacted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "294a9083",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "for e in ds_redacted:\n",
    "    secrets = json.loads(e[\"secrets\"])\n",
    "    if len(secrets) >= 3:\n",
    "        print(e[\"id\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "259f9759",
   "metadata": {},
   "source": [
    "example 16"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5a37524",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_redacted[16][\"secrets\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "04f7e74d",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Old text:\")\n",
    "print(ds_redacted[16][\"content\"][1190:1500])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "470bf3aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"New text:\")\n",
    "print(ds_redacted[16][\"new_content\"][1190:1500])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "897b7ebf",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"New text with delimietrs (for visualization in a space):\")\n",
    "print(ds_redacted[16][\"redaction_refs\"][1190:1500])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "39e051da",
   "metadata": {},
   "source": [
    "example 27"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c129f763",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_redacted[27][\"secrets\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35977e2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Old text:\")\n",
    "# we don't replace private Ips like 0.0.0.0\n",
    "print(ds_redacted[27][\"content\"][150:250])\n",
    "\n",
    "print(\"\\nNew text:\")\n",
    "print(ds_redacted[27][\"new_content\"][150:250])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d081f2ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Old text:\")\n",
    "print(ds_redacted[27][\"content\"][270:670])\n",
    "\n",
    "print(\"\\nNew text:\")\n",
    "# here the first part of the key was detected and replaced with pj82ffu65gt9sh9v8n9s2fyupslmlcq\n",
    "print(ds_redacted[27][\"new_content\"][270:470])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0661335f",
   "metadata": {},
   "source": [
    "example 49"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f332863",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds_redacted[49][\"secrets\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e2248f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Old text:\")\n",
    "print(ds_redacted[49][\"content\"][30:70])\n",
    "\n",
    "print(\"\\nNew text:\")\n",
    "# here the first part of the key was detected and replaced with pj82ffu65gt9sh9v8n9s2fyupslmlcq\n",
    "print(ds_redacted[49][\"new_content\"][30:70])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.4 ('venv')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  },
  "vscode": {
   "interpreter": {
    "hash": "fd8fde6f83dada9276d12fdb71d773558994168ed1b3bea457b8db38c02aa2e1"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
